import pandas as pd

# Import data
df = pd.read_excel('../original/Sample Hand-to-Eye Count Survey_President and Vice President of the United States Contest_2020.xlsx')

# Add necessary columns
df['state'] = 'NORTH CAROLINA'
df['date'] = '2020-11-03'
df['office']='US PRESIDENT'

# Rename necessary columns
df.rename(columns={'Name of County': 'county'}, inplace = True)
df.rename(columns={'Name of Sample Precinct?': 'precinct'}, inplace = True)


# Make all county names upper case
df['county'] = df['county'].str.upper()

# Remove "#" signs in variable "precinct"
df['precinct'] = df['precinct'].str.replace('#','')

# Drop some disruptive tabs
df.loc[df["precinct"] == "CUMBERLAND	ONE-STOP	K	WESTOVER RECREATION CENTER", "precinct"] = "CUMBERLAND ONE-STOP K WESTOVER RECREATION CENTER"

# Rename original and audited votes column names
df.rename(columns={'Machine Count': 'original_votes_1'}, inplace = True)
df.rename(columns={'Hand-to-eye count': 'audited_votes_1'}, inplace = True)
for i in range(2,7):
    df.rename(columns={'Machine Count'+str(i): 'original_votes_'+str(i)}, inplace = True)
    df.rename(columns={'Hand-to-eye count'+str(i): 'audited_votes_'+str(i)}, inplace = True)

# Remove an extra comma used in one of the columns and cast the entry as integer
df['original_votes_1'] = df['original_votes_1'].str.replace(',','')
df = df.astype({'original_votes_1': 'int64'})

   

# Drop unnecessary columns
# Using numbers of columns to get their names more easily
# cols = df.columns[0:65]
# for i in range(65):
#     print(str(i) +'   ' + cols[i])
# cols = [cols[index] for index in [56, 57, 58, 5,8,11,12,17,18,23,24,29,30,35,36,41,42,59,60,61,62,63,64]]
# print(cols)
df = df[['state', 'date', 'office', 'county', 'precinct', 'original_votes_1', 'audited_votes_1', 'original_votes_2', 'audited_votes_2', 'original_votes_3', 'audited_votes_3', 'original_votes_4', 'audited_votes_4', 'original_votes_5', 'audited_votes_5', 'original_votes_6', 'audited_votes_6']]


# Move candidates from columns to rows

df['Trump'] = list(zip(df['original_votes_1'], df['audited_votes_1']))
df['Biden'] = list(zip(df['original_votes_2'], df['audited_votes_2']))
df['Blankenship'] = list(zip(df['original_votes_3'], df['audited_votes_3']))
df['Hawkins'] = list(zip(df['original_votes_4'], df['audited_votes_4']))
df['Jorgensen'] = list(zip(df['original_votes_5'], df['audited_votes_5']))
df['write-in'] = list(zip(df['original_votes_6'], df['audited_votes_6']))

df = df.melt(['state', 'date', 'office', 'county', 'precinct'],['Trump', 'Biden', 'Blankenship', 'Hawkins', 'Jorgensen', 'write-in'], var_name = 'candidate')

df[['original_votes', 'audited_votes']] = pd.DataFrame(df['value'].tolist(), index=df.index)
df.drop('value', axis=1, inplace = True)


# Calculate differences between original and audited votes
df['diff']=df['audited_votes']-df['original_votes']

# Sort
df.sort_values(['county', 'precinct'], inplace=True)

#Drop write-ins
df = df.drop(df[df["candidate"] == "write-in"].index)

# Save the work
df.to_csv('../ready/North Carolina cleaned.csv', index=False)
